% Experiment configuration.
% The generator is seeded before anything else so that the rand() draw for
% w_0 below (and every later rand() call in the script) is reproducible.
rng(10)

d          = 300;    % dimension of the iterates (w_0 is d-by-1)
iterations = 1000;   % outer iterations run by each method

% Arguments handed to hard_cubic(L, mu, gamma) when the objective is built.
L     = 1;
mu    = 0;
gamma = 1;

% Shared starting point: uniform draws on (0,1) scaled by 1e5.
w_0 = 1e5 .* rand(d, 1);

% Show numeric output with long precision.
format('long');

%%%%%%%%%%%%%%%%%%%%%%%%%%% Quasi-Newton %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Iteration axis used for plotting; entry 0 corresponds to the start point.
x_qn = 0;

% Per-method histories of the normalized gap
%   (f(w_k) - f(w_opt)) / (f(w_0) - f(w_opt)),
% which equals 1 at the starting point w_0.
[y_agd, y_newton, y_qn_weak_I, y_qn_weak_cI, y_gd] = deal(1);

% Objective built by hard_cubic (defined outside this script); the handles
% below forward to its f, g and H members.
obj  = hard_cubic(L, mu, gamma);
loss = @(point) obj.f(point);
grad = @(point) obj.g(point);
hess = @(point) obj.H(point);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Reference run: BFGS with a derivative-free (fminsearch) line search.
% Its final iterate is stored in w_opt and used by every later section as
% the stand-in for the minimizer when normalizing the optimality gap.

w = w_0;

% Initial inverse-Hessian approximation H = (1/c)*I, where c is the
% curvature estimate (g2 - g1)'(w2 - w1) / ||w2 - w1||^2 measured between
% two random points.
w_1 = rand(d, 1);
w_2 = rand(d, 1);
gradient_1 = grad(w_1);
gradient_2 = grad(w_2);
c = dot(gradient_2 - gradient_1, w_2 - w_1)./dot(w_2 - w_1, w_2 - w_1);
H = (1./c)*eye(d);

% Loop-invariant: build the fminsearch options once.
options = optimset('MaxIter',1000);

for iter = 1:iterations
    disp(iter);
    gradient = grad(w);

    % Form the quasi-Newton direction once; the original re-scaled the full
    % d-by-d matrix H on every objective evaluation made by fminsearch.
    direction = -H*gradient;
    step_size = @(eta) loss(w + eta*direction);
    lambda = fminsearch(step_size, 1, options);

    w_new = w + lambda*direction;
    gradient_new = grad(w_new);
    s = w_new - w;
    y = gradient_new - gradient;

    % BFGS inverse update
    %   H <- (I - t*s*y')*H*(I - t*y*s') + t*s*s',   t = 1/(s'*y).
    % It is only well defined for s'*y > 0. When the step underflows
    % (s == 0) or the curvature is non-positive, 1/(s'*y) is Inf or has the
    % wrong sign and H would turn into NaN/indefinite, so skip the update.
    sy = s'*y;
    if sy > eps*norm(s)*norm(y)
        t = 1.0/sy;
        G = t*(H*y)*s';
        K = s*s';
        H = H - G' - G + (t^2*(y'*H*y) + t)*K;
    end

    w = w_new;
end

w_opt = w;

% Line-search constants shared by the sections below.
alpha = 0.1;   % sufficient-decrease (Armijo) constant
beta = 0.9;    % curvature constant used by the weak-Wolfe searches

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Damped Newton method with Armijo backtracking, started from w_0.
% Appends the normalized gap after every iteration to y_newton.

w = w_0;

% Loop-invariant pieces of the normalized gap
% (assumes loss is a deterministic function of its argument).
f_opt = loss(w_opt);
gap_0 = loss(w_0) - f_opt;

for iter = 1:iterations
    disp(iter);
    gradient = grad(w);
    H = hess(w);
    direction = -(H\gradient);
    function_value = loss(w);

    % Directional derivative g'*d. The Armijo test needs this scalar; the
    % original compared against the vector alpha*lambda*direction, which
    % makes the `if` depend on an element-wise vector comparison.
    slope = dot(gradient, direction);

    % Backtrack while f(w + lambda*d) > f(w) + alpha*lambda*g'*d.
    % Strict '>' so a stationary point (d == 0) is accepted immediately
    % instead of shrinking lambda for all 1e5 passes.
    lambda = 1;
    for i = 1:1e5
        if loss(w + lambda*direction) > function_value + alpha*lambda*slope
            lambda = 0.9*lambda;
        else
            break
        end
    end

    % Take the damped step. The original `w - lambda*H\gradient` parses as
    % w - (lambda*H)\gradient, i.e. a step of length 1/lambda, not lambda.
    w = w + lambda*direction;
    y_newton = [y_newton, (loss(w) - f_opt)/gap_0];
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% BFGS with a bisection line search for the weak Wolfe conditions,
% initial inverse-Hessian approximation H = I.
% Appends the iteration index to x_qn and the normalized gap to y_qn_weak_I.

w = w_0;
H = eye(d);

% Loop-invariant pieces of the normalized gap
% (assumes loss is a deterministic function of its argument).
f_opt = loss(w_opt);
gap_0 = loss(w_0) - f_opt;

for iter = 1:iterations
    disp(iter);
    gradient = grad(w);
    function_value = loss(w);
    direction = -H*gradient;

    % Directional derivative g'*d, used by both Wolfe tests. The original
    % sufficient-decrease test used the vector alpha*lambda*direction here,
    % so it did not implement the Armijo condition.
    slope = dot(gradient, direction);

    lambda = 1;
    lambda_max = 1e20;   % sentinel: no upper bracket found yet
    lambda_min = 0;
    for i = 1:1e5
        if loss(w + lambda*direction) > function_value + alpha*lambda*slope
            % Sufficient decrease violated: step too long, shrink bracket.
            lambda_max = lambda;
            lambda = (lambda_max + lambda_min)/2;
        elseif dot(grad(w + lambda*direction), direction) < beta*slope
            % Curvature condition violated: step too short.
            lambda_min = lambda;
            if lambda_max == 1e20
                lambda = 2*lambda;
            else
                lambda = (lambda_max + lambda_min)/2;
            end
        else
            break
        end
    end

    w_new = w + lambda*direction;
    gradient_new = grad(w_new);
    s = w_new - w;
    y = gradient_new - gradient;

    % BFGS inverse update
    %   H <- (I - t*s*y')*H*(I - t*y*s') + t*s*s',   t = 1/(s'*y).
    % Skip it when s'*y is not safely positive (e.g. s == 0 after
    % convergence, or the line search ran out of passes); otherwise
    % t = Inf/NaN would corrupt H for all remaining iterations.
    sy = s'*y;
    if sy > eps*norm(s)*norm(y)
        t = 1.0/sy;
        G = t*(H*y)*s';
        K = s*s';
        H = H - G' - G + (t^2*(y'*H*y) + t)*K;
    end

    w = w_new;
    x_qn = [x_qn, iter];
    y_qn_weak_I = [y_qn_weak_I, (loss(w) - f_opt)/gap_0];
end

% BFGS with a bisection line search for the weak Wolfe conditions,
% initial inverse-Hessian approximation H = (1/c)*I.
% Appends the normalized gap after every iteration to y_qn_weak_cI.

w = w_0;

% c is the curvature estimate (g2 - g1)'(w2 - w1) / ||w2 - w1||^2 measured
% between two fresh random points.
w_1 = rand(d, 1);
w_2 = rand(d, 1);
gradient_1 = grad(w_1);
gradient_2 = grad(w_2);
c = dot(gradient_2 - gradient_1, w_2 - w_1)./dot(w_2 - w_1, w_2 - w_1);
H = (1./c)*eye(d);

% Loop-invariant pieces of the normalized gap
% (assumes loss is a deterministic function of its argument).
f_opt = loss(w_opt);
gap_0 = loss(w_0) - f_opt;

for iter = 1:iterations
    disp(iter);
    gradient = grad(w);
    function_value = loss(w);
    direction = -H*gradient;

    % Directional derivative g'*d, used by both Wolfe tests. The original
    % sufficient-decrease test used the vector alpha*lambda*direction here,
    % so it did not implement the Armijo condition.
    slope = dot(gradient, direction);

    lambda = 1;
    lambda_max = 1e20;   % sentinel: no upper bracket found yet
    lambda_min = 0;
    for i = 1:1e5
        if loss(w + lambda*direction) > function_value + alpha*lambda*slope
            % Sufficient decrease violated: step too long, shrink bracket.
            lambda_max = lambda;
            lambda = (lambda_max + lambda_min)/2;
        elseif dot(grad(w + lambda*direction), direction) < beta*slope
            % Curvature condition violated: step too short.
            lambda_min = lambda;
            if lambda_max == 1e20
                lambda = 2*lambda;
            else
                lambda = (lambda_max + lambda_min)/2;
            end
        else
            break
        end
    end

    w_new = w + lambda*direction;
    gradient_new = grad(w_new);
    s = w_new - w;
    y = gradient_new - gradient;

    % BFGS inverse update
    %   H <- (I - t*s*y')*H*(I - t*y*s') + t*s*s',   t = 1/(s'*y).
    % Skip it when s'*y is not safely positive (e.g. s == 0 after
    % convergence, or the line search ran out of passes); otherwise
    % t = Inf/NaN would corrupt H for all remaining iterations.
    sy = s'*y;
    if sy > eps*norm(s)*norm(y)
        t = 1.0/sy;
        G = t*(H*y)*s';
        K = s*s';
        H = H - G' - G + (t^2*(y'*H*y) + t)*K;
    end

    w = w_new;
    y_qn_weak_cI = [y_qn_weak_cI, (loss(w) - f_opt)/gap_0];
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Gradient descent from w_0. The step length along the negative gradient is
% chosen by fminsearch (started at 1) on the one-dimensional restriction of
% the loss. The normalized gap is appended to y_gd after every step.
w = w_0;
options = optimset('MaxIter',1000);

for iter = 1:iterations
    disp(iter);

    gradient = grad(w);
    lambda = fminsearch(@(eta) loss(w - eta*gradient), 1, options);
    w = w - lambda*gradient;

    numerator   = loss(w) - loss(w_opt);
    denominator = loss(w_0) - loss(w_opt);
    y_gd(end + 1) = numerator/denominator;
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Accelerated gradient descent from w_0.
% Each iteration extrapolates to z = w_curr + b*(w_curr - w_prev), takes a
% gradient step from z with a step length chosen by fminsearch (started at
% 1), then advances the momentum sequence
%   a <- (1 + sqrt(1 + 4*a^2))/2,   b <- a_old / a_new.
% The normalized gap is appended to y_agd after every step.
w_prev = w_0;
w_curr = w_0;
a = 0;
b = 0;
options = optimset('MaxIter',1000);

for iter = 1:iterations
    disp(iter);

    % Extrapolated point and gradient step from it.
    z = w_curr + b*(w_curr - w_prev);
    gradient = grad(z);
    lambda = fminsearch(@(eta) loss(z - eta*gradient), 1, options);

    w_prev = w_curr;
    w_curr = z - lambda*gradient;

    % Momentum coefficient for the next iteration.
    a_prev = a;
    a = (1 + sqrt(1 + 4*a_prev*a_prev))/2;
    b = a_prev/a;

    numerator   = loss(w_curr) - loss(w_opt);
    denominator = loss(w_0) - loss(w_opt);
    y_agd(end + 1) = numerator/denominator;
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Convergence plot: normalized gap versus iteration on a log-scaled y axis
% for the two BFGS variants, GD and AGD (all drawn against x_qn).
figure;

% Name/value pairs shared by every curve: a marker every 300 points and a
% thick line.
curve_opts = {'MarkerIndices', 1:300:length(x_qn), 'LineWidth', 3};

semilogy(x_qn, y_qn_weak_I, 'r-*', curve_opts{:});
grid('on')
hold('on')
semilogy(x_qn, y_qn_weak_cI, 'b-*', curve_opts{:});
semilogy(x_qn, y_gd, 'g-*', curve_opts{:});
semilogy(x_qn, y_agd, 'k-*', curve_opts{:});

legend({'BFGS I', 'BFGS cI', 'GD', 'AGD'}, ...
       'Interpreter','latex','fontsize',20,'Location','southwest')

% Axis limits and ticks.
xlim([0 1000])
ylim([1e-20 1e0])
xticks(0:100:1000)

% Tick-label font size first, then the larger axis labels.
ax = gca;
ax.FontSize = 15;
xlabel('number of iterations $k$','Interpreter','latex','fontsize',30)
ylabel('$\frac{f(x_k) - f(x_*)}{f(x_0) - f(x_*)}$','Interpreter','latex','fontsize',30)

set(gcf,'position',[0,0,600,400])
hold('off')